#!/usr/bin/python

def term_compare(x, y):
	"""Old-style ``cmp`` comparator that orders items by descending score.

	The score of an item is ``item[1] + 2 * item[2]``; the items are
	presumably the ``(term, occurrences, strength)`` tuples yielded by
	topia's term extractor (confirm against the caller).

	Returns 1 when ``y`` scores higher than ``x`` (so ``y`` sorts first),
	-1 when ``x`` scores higher, and 0 when both scores are equal.
	"""
	x_score = x[1] + x[2] * 2
	y_score = y[1] + y[2] * 2
	# Ties must compare as 0 in *both* argument orders.  Returning -1 for
	# equal scores with different components (e.g. (2, 1) vs (0, 2)) makes
	# the comparator inconsistent and the resulting sort order undefined.
	return (y_score > x_score) - (y_score < x_score)

def main():
	"""Print the cleaned text of /tmp/foo.txt followed by its top 50 terms.

	The file content is stripped of markup, http URLs and escaped quote
	markers, has its HTML entities converted, and is lower-cased.  The
	result is fed to topia's TermExtractor; the extracted terms are sorted
	with ``term_compare``, truncated to 50 entries and dumped as JSON.
	"""
	import re
	import simplejson as json
	from topia.termextract import extract
	# Only referenced by the disabled un-escaping variants further down.
	from lxml import html
	from BeautifulSoup import BeautifulSoup

	extractor = extract.TermExtractor()
	extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
	# Alternative filters that were tried:
	#extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2,noLimitStrength=-10)
	#extractor.filter = extract.permissiveFilter

	# Close the input file even if reading fails; try/finally keeps this
	# working on interpreters that predate the ``with`` statement.
	f = open("/tmp/foo.txt", "r")
	try:
		thread = f.read()
	finally:
		f.close()

	# Disabled variant: double unescape through lxml.
	##thread = html.fromstring(html.fromstring(thread).text).text

	# Replace tags, http URLs and escaped ">>>" / ">>" markers by a space.
	# The three-marker pattern has to run before the two-marker one.
	thread = re.sub(r"<.+?>", ' ', thread)
	thread = re.sub(r"http://\S+", ' ', thread)
	thread = re.sub(r"&gt;&gt;&gt;", ' ', thread)
	thread = re.sub(r"&gt;&gt;", ' ', thread)

	# Let BeautifulSoup turn the remaining HTML entities into characters,
	# then go back to a plain string for the extractor.
	thread = BeautifulSoup(thread, convertEntities=BeautifulSoup.HTML_ENTITIES)
	thread = str(thread)
	#thread = html.fromstring(thread).text
	#thread = thread.encode('UTF-8')
	thread = thread.lower()

	# term_compare is passed positionally as the ``cmp`` function.
	terms = sorted(extractor(thread), term_compare)
	terms = terms[:50]

	# A single parenthesised argument prints exactly like the bare
	# print statement.
	print(thread)
	print("---------------------------------")
	print(json.dumps(terms, indent=4))


# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()

